# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Set display.max_rows to None so displayed frames are not cut off by a row cap
pd.set_option('display.max_rows', None)
# Load the channel dataset from CSV (the file name contains a space before
# the extension) and preview its first rows
df = pd.read_csv('Top YouTube Channels Data .csv')
df.head()
| | rank | youtuber | subscribers | video views | video count | category | started |
|---|---|---|---|---|---|---|---|
| 0 | 1 | T-Series | 213000000 | 188,073,919,029 | 16708.0 | Music | 2006 |
| 1 | 2 | YouTube Movies | 150000000 | 167,122,746,349 | NaN | Film & Animation | 2015 |
| 2 | 3 | Cocomelon - Nursery Rhymes | 133000000 | 126,822,520,940 | 751.0 | Education | 2006 |
| 3 | 4 | SET India | 131000000 | 101,541,977,714 | 78334.0 | Shows | 2006 |
| 4 | 5 | Music | 116000000 | 78,437,871,689 | NaN | Music | 2013 |
# Print the column dtypes and non-null counts of the loaded frame
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 rank 100 non-null int64 1 youtuber 100 non-null object 2 subscribers 100 non-null int64 3 video views 100 non-null object 4 video count 95 non-null float64 5 category 100 non-null object 6 started 100 non-null int64 dtypes: float64(1), int64(3), object(3) memory usage: 5.6+ KB
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()
| | rank | subscribers | video count | started |
|---|---|---|---|---|
| count | 100.000000 | 1.000000e+02 | 95.000000 | 100.000000 |
| mean | 50.500000 | 5.336300e+07 | 15847.221053 | 2010.800000 |
| std | 29.011492 | 2.869713e+07 | 40955.200388 | 5.504819 |
| min | 1.000000 | 3.220000e+07 | 45.000000 | 1970.000000 |
| 25% | 25.750000 | 3.620000e+07 | 393.500000 | 2007.750000 |
| 50% | 50.500000 | 4.320000e+07 | 1139.000000 | 2012.000000 |
| 75% | 75.250000 | 5.710000e+07 | 4986.000000 | 2014.000000 |
| max | 100.000000 | 2.130000e+08 | 209351.000000 | 2018.000000 |
# (rows, columns) of the dataset
df.shape
(100, 7)
# Number of missing values in each column
df.isnull().sum()
rank 0 youtuber 0 subscribers 0 video views 0 video count 5 category 0 started 0 dtype: int64
# Replace missing values with 0. Only the video-count column has nulls
# (5 rows), so those channels count as having 0 videos in later sums and means.
df = df.fillna(0)
# 'video views ' is loaded as text because of its thousands separators
# (e.g. '188,073,919,029'): strip the commas as a literal string (not a
# regex), then cast the column to int64 in a single assignment.
df['video views '] = (
    df['video views ']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('int64')
)
# Number of channels per start year, drawn as a point plot
year = df['started '].value_counts()
plt.figure(figsize=(20, 8))
sns.pointplot(
    x=year.index,
    y=year.values,
    color='blue',
).set(xlabel='Year', ylabel='Count')
plt.title('Started year of youtube channels', size=30, color='blue')
plt.grid()
plt.show()
# Channels per category, bars ordered from most to least frequent category
plt.figure(figsize=(20, 8))
sns.countplot(
    data=df,
    x='category ',
    order=df['category '].value_counts().index,
    color='green',
)
plt.title('Category Count', size=30, color='green')
plt.xticks(rotation=90)
plt.show()
# Same per-category channel counts, drawn from the precomputed value counts
category = df['category '].value_counts()
plt.figure(figsize=(20, 8))
sns.barplot(
    x=category.index,
    y=category.values,
    color='red',
).set(xlabel='Category', ylabel='Count')
plt.title('Top Categories by Youtubers', size=30, color='red')
plt.show()
# Total video views per category, largest first
category_view = (
    df.groupby('category ')['video views ']
    .sum()
    .reset_index()
    .sort_values(by='video views ', ascending=False)
)
plt.figure(figsize=(20, 8))
sns.barplot(x='category ', y='video views ', data=category_view, color='purple')
plt.title(
    'Which Categories get the most total views (1.0 = 1 trillion)',
    size=30,
    color='purple',
)
plt.show()
# Total number of videos published per category, largest first
cat_vid = (
    df.groupby('category ')['video count ']
    .sum()
    .reset_index()
    .sort_values(by='video count ', ascending=False)
)
plt.figure(figsize=(20, 8))
sns.barplot(data=cat_vid, x='category ', y='video count ', color='orange')
plt.title('Total Number of Videos made for each category', size=30, color='orange')
plt.xlabel('Category')
plt.ylabel('Total Number of Videos')
plt.xticks(rotation=45)
plt.show()
# Mean of every numeric column per start year. numeric_only=True is needed on
# pandas >= 2.0, where mean() raises a TypeError on the text columns
# (youtuber, category) instead of silently dropping them.
year_mean = df.groupby('started ').mean(numeric_only=True).reset_index()
year_mean
def pltplot(data, xcol, ycol, color, ax, title):
    """Draw a seaborn point plot of ``ycol`` against ``xcol`` on ``ax``.

    Args:
        data: Table handed to ``sns.pointplot`` as its ``data`` argument.
        xcol: Name of the column plotted on the x axis.
        ycol: Name of the column plotted on the y axis.
        color: Colour passed to ``sns.pointplot``.
        ax: Matplotlib axes to draw on.
        title: Title set on the axes (font size 20).
    """
    axes = sns.pointplot(data=data, x=xcol, y=ycol, color=color, ax=ax)
    axes.set_title(title, size=20)
# Three stacked panels showing the per-start-year means of subscribers,
# video views and video count.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1)
fig.set_size_inches(20, 20)
pltplot(year_mean, 'started ', 'subscribers ', 'lightcoral', ax1, 'Subscribers per Year (mean)')
pltplot(year_mean, 'started ', 'video views ', 'green', ax2, 'Video views per Year (mean)')
pltplot(year_mean, 'started ', 'video count ', 'gold', ax3, 'Video count per Year (mean)')
# tight_layout only accounts for artists that already exist, so run it after
# the panels have their titles, labels and ticks.
fig.tight_layout(pad=3.0)
# Sunburst of the category -> channel hierarchy
fig = px.sunburst(df, path=['category ', 'youtuber'])
fig.update_layout(
    autosize=False,
    width=800,
    height=800,
    title={'text': '<b>Youtube Channels with its Genres</b>'},
)
fig.show()
# Video count per channel, bars ordered by total descending
fig = px.bar(df, x='youtuber', y='video count ', width=1500, height=700)
fig.update_xaxes(categoryorder='total descending', tickmode='linear')
fig.update_layout(
    title={'text': '<b>Youtube Channels with the most videos published</b>'},
)
fig.show()
# Video views per channel, bars ordered by total descending
fig = px.bar(df, x='youtuber', y='video views ', width=1500, height=700)
fig.update_xaxes(categoryorder='total descending', tickmode='linear')
fig.update_layout(
    title={'text': '<b>Youtube Channels with the most videos viewed</b>'},
)
fig.show()